import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import missingno as mn
import re
import datetime
#import dask.dataframe as dd
# Global session configuration: dark-grid seaborn theme and a wide pandas
# display so all ~25 columns of the transaction data are visible.
sns.set(style="darkgrid")
pd.set_option('display.max_columns', 500)
def load_join(path, fs, ls):
    """Load and vertically stack yearly transaction dumps.

    Reads the semicolon-separated files ``<path><year>.txt`` for every year
    in the half-open range [fs, ls) and concatenates them.

    Parameters
    ----------
    path : str
        Common file prefix, e.g. ``"Mitakus Data/Data"``.
    fs : int
        First year to read (inclusive).
    ls : int
        Last year (exclusive), matching the original ``range`` semantics.

    Returns
    -------
    pandas.DataFrame
        All rows from all files, with a fresh RangeIndex.
    """
    # pd.concat over a list replaces the deprecated (removed in pandas 2.0)
    # DataFrame.append loop, which also re-copied the data on every iteration.
    frames = [pd.read_csv(path + str(year) + ".txt", encoding="ISO-8859-1", sep=";")
              for year in range(fs, ls)]
    return pd.concat(frames, ignore_index=True)
def not_pattern(pattern, data):
    """Count values of ``data`` that do NOT match ``pattern`` at the start.

    Used as a sanity check that e.g. every ``Time`` is ``HH:MM`` and every
    ``Date`` is ``dd.mm.20yy`` (a return of 0 means the whole column is clean).

    Parameters
    ----------
    pattern : str
        Regular expression, tested with ``re.match`` (anchored at the start).
    data : pandas.Series
        Series of strings to validate.

    Returns
    -------
    int
        Number of non-matching values.
    """
    compiled = re.compile(pattern)
    # sum() over a generator replaces the manual counter loop.
    return sum(1 for value in data.values if not compiled.match(value))
The data used
# --- Load and clean the raw 2011-2014 dumps --------------------------------
df2 = load_join("Mitakus Data/Data", 2011, 2015)
df2.info()
df2.shape
df2 = df2.drop_duplicates()
df2 = df2.reset_index(drop=True)
# Format sanity checks (0 expected). Raw strings avoid the invalid-escape
# warning that plain "\d" produces on modern Python.
not_pattern(r"\d{2}:\d{2}$", df2["Time"])
not_pattern(r"\d{2}\.\d{2}\.20\d{2}$", df2["Date"])
df2["DateTime"] = df2.Date + " " + df2.Time
# Dates are dd.mm.yyyy -> dayfirst=True, consistent with the parse performed
# later on the merged data; without it pandas can swap day and month.
df2["DateTime"] = pd.to_datetime(df2["DateTime"], dayfirst=True)
# Vectorised .dt accessors replace the per-row list comprehensions.
df2["year"] = df2["DateTime"].dt.year
df2["month"] = df2["DateTime"].dt.month
df2["day"] = df2["DateTime"].dt.day
# German decimal comma -> dot, then cast to float (vectorised str.replace
# instead of a Python-level apply per column).
for money_col in ["Revenue", "NetTotal", "StandardPrice"]:
    df2[money_col] = df2[money_col].str.replace(",", ".", regex=False).astype(float)
# Columns that carry no analytical value for this study.
df2 = df2.drop(["CostUnit","PriceList","CurrencyFlag","ValidityCard"], axis=1)
Checkpoint :
df2.to_csv("Mitakus Data/2011-2014.csv",index=False)
# --- Load and clean the raw 2015-2018 dumps --------------------------------
# Data2016.txt lacks a header row, so reuse the 2015 column names for it.
coln = pd.read_csv("Mitakus Data/Data2015.txt", encoding="ISO-8859-1", sep=";").columns
# pd.concat replaces the deprecated chained DataFrame.append calls.
df = pd.concat([
    pd.read_csv("Mitakus Data/Data2015.txt", encoding="ISO-8859-1", sep=";"),
    pd.read_csv("Mitakus Data/Data2016.txt", encoding="ISO-8859-1", sep=";", names=coln),
    pd.read_csv("Mitakus Data/Data2017.txt", encoding="ISO-8859-1", sep=";", index_col=False),
    pd.read_csv("Mitakus Data/Data2018.txt", encoding="ISO-8859-1", sep=";", index_col=False),
])
df.head()
df.info()
df.shape
df.columns
df = df.drop_duplicates()
df = df.reset_index(drop=True)
df = df.drop(["CostUnit","PriceList","CurrencyFlag","ValidityCard"], axis=1)
df.shape
# Format sanity checks (raw strings avoid invalid-escape warnings).
not_pattern(r"\d{2}:\d{2}$", df["Time"])
not_pattern(r"\d{2}\.\d{2}\.20\d{2}$", df["Date"])
df["DateTime"] = df.Date + " " + df.Time
# dd.mm.yyyy dates -> dayfirst=True (see 2011-2014 cleaning above).
df["DateTime"] = pd.to_datetime(df["DateTime"], dayfirst=True)
df["year"] = df["DateTime"].dt.year
df["month"] = df["DateTime"].dt.month
df["day"] = df["DateTime"].dt.day
# Decimal comma -> dot, then numeric cast (vectorised).
for money_col in ["Revenue", "NetTotal", "StandardPrice"]:
    df[money_col] = df[money_col].str.replace(",", ".", regex=False).astype(float)
Checkpoint :
df.to_csv("Mitakus Data/2015-2018.csv",index=False)
# Reload both checkpoints with one shared dtype map (the original repeated
# the same 24-entry dict twice).  High-cardinality text columns are read as
# categoricals to keep the memory footprint of ~10M rows manageable.
DTYPES = {
    "Date": "category", "Time": "category", "CashPoint": "category",
    "ReceiptNumber": "int64", "Cashier": "category", "PaymentType": "category",
    "CardID": "category", "Firm": "category", "Department": "category",
    "SubventionLevel": "category", "PositionType": "category",
    "ItemNumber": "category", "Taxes": "category", "Amount": "category",
    "Revenue": "float", "ItemDescription": "category", "NetTotal": "float",
    "StandardPrice": "float", "KeyCardCounter": "int64", "Weight": "category",
    "DateTime": "object", "year": "int64", "month": "int64", "day": "int64",
}
# pd.concat replaces the deprecated DataFrame.append.
df = pd.concat([
    pd.read_csv("Mitakus Data/2011-2014.csv", dtype=DTYPES, parse_dates=["DateTime"]),
    pd.read_csv("Mitakus Data/2015-2018.csv", dtype=DTYPES, parse_dates=["DateTime"]),
])
Checkpoint :
df.to_csv("Mitakus Data/2011-2018.csv",index=False)
# Reload the merged checkpoint with an explicit dtype map; categoricals keep
# the text columns memory-friendly.
DTYPES_ALL = {
    "Date": "category", "Time": "category", "CashPoint": "category",
    "ReceiptNumber": "int64", "Cashier": "category", "PaymentType": "category",
    "CardID": "category", "Firm": "category", "Department": "category",
    "SubventionLevel": "category", "PositionType": "category",
    "ItemNumber": "category", "Taxes": "category", "Amount": "category",
    "Revenue": "float", "ItemDescription": "category", "NetTotal": "float",
    "StandardPrice": "float", "KeyCardCounter": "int64", "Weight": "category",
    "DateTime": "object", "year": "int64", "month": "int64", "day": "int64",
}
df = pd.read_csv("Mitakus Data/2011-2018.csv", dtype=DTYPES_ALL, parse_dates=["DateTime"])
df.info()
# Keep only rows with a plausible year (guards against corrupt date rows).
df["year"].isin([2011,2012,2013,2014,2015,2016,2017,2018]).value_counts()
df = df[df.year.isin([2011,2012,2013,2014,2015,2016,2017,2018])]
184 different Cashpoint :
len(set(df.CashPoint))
# Keep only the six canteen registers: 1,2 -> Lehel, 11-14 -> Giesing.
df.CashPoint.isin(["1","2","11","12","13","14"]).value_counts()
df = df[df.CashPoint.isin(["1","2","11","12","13","14"])]
df.shape
len(set(df.CashPoint))
# Rebuild DateTime from the raw strings; dayfirst=True parses dd.mm.yyyy
# correctly.
df["Date"] = df["Date"].astype("str")
df["Time"] = df["Time"].astype("str")
df["DateTime"] = df.Date + " " + df.Time
df["DateTime"] = pd.to_datetime(df["DateTime"], dayfirst=True)
# Vectorised .dt accessors replace the per-row list comprehensions.
df["year"] = df["DateTime"].dt.year
df["month"] = df["DateTime"].dt.month
df["day"] = df["DateTime"].dt.day
from pandas.api.types import CategoricalDtype
# Ordered weekday categorical so plots sort Monday..Sunday, not alphabetically.
cat_type = CategoricalDtype(categories=[ 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'], ordered=True)
# Series.dt.day_name() replaces .dt.weekday_name, which was removed in
# pandas >= 1.0.
df["dayoftheweek"] = df.DateTime.dt.day_name().astype(cat_type)
df = df.sort_values(by="DateTime",ascending=True)
df = df.reset_index(drop=True)
# Map register ids to canteen locations (vectorised np.where instead of a
# Python-level list comprehension).
df["location"] = np.where(df.CashPoint.isin(["1", "2"]), "Lehel", "Giesing")
df.to_csv("Mitakus Data/2011-2018_f.csv",index=False)
# Reload the final feature-complete data set with an explicit dtype map.
DTYPES_FINAL = {
    "Date": "category", "Time": "category", "CashPoint": "category",
    "ReceiptNumber": "int64", "Cashier": "category", "PaymentType": "category",
    "CardID": "category", "Firm": "category", "Department": "category",
    "SubventionLevel": "category", "PositionType": "category",
    "ItemNumber": "category", "Taxes": "category", "Amount": "category",
    "Revenue": "float", "ItemDescription": "category", "NetTotal": "float",
    "StandardPrice": "float", "KeyCardCounter": "int64", "Weight": "category",
    "DateTime": "object", "year": "int64", "month": "int64", "day": "int64",
    "dayoftheweek": "category", "location": "category",
}
df = pd.read_csv("Mitakus Data/2011-2018_f.csv", dtype=DTYPES_FINAL, parse_dates=["DateTime"])
from pandas.api.types import CategoricalDtype
cat_type = CategoricalDtype(categories=[ 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'], ordered=True)
# day_name() replaces the weekday_name attribute removed in pandas >= 1.0.
df["dayoftheweek"] = df.DateTime.dt.day_name().astype(cat_type)
# Row filtering leaves stale category levels behind; drop them so plots and
# groupbys do not show empty categories.  isinstance against the public
# pd.CategoricalDtype replaces the private pd.core.dtypes.dtypes check.
for col in df.columns:
    if isinstance(df[col].dtype, pd.CategoricalDtype):
        df[col] = df[col].cat.remove_unused_categories()
df.info()
# Transaction counts per register per year.
sns.catplot(x="CashPoint", col="year", data=df, kind="count")
There is a big difference between transactions at CashPoints 1, 2 (Lehel) and 11, 12, 13, 14 (Giesing), from which we can infer that Giesing serves more people than Lehel; however, we still do not know the reason behind the small number of transactions at CashPoint 14 (Giesing).
# Payment types per year, split by location; sharey=False because the two
# locations have very different volumes.  Draw one legend per facet.
g = sns.catplot(x="PaymentType",col="year",data=df,hue="location",sharey=False ,kind="count",legend=False)
pd.Series(g.fig.get_axes()).apply(lambda x : x.legend());
# Share of each payment type over all transactions, in percent.
round(df["PaymentType"].value_counts()*100/len(df),2)
99.29% of the transactions are made by CardID
# Firm code counts per year and location.
g = sns.catplot(x="Firm",col="year",data=df,hue="location",sharey=False ,kind="count",legend=False)
pd.Series(g.fig.get_axes()).apply(lambda x : x.legend());
percent=round(df["Firm"].value_counts()*100/len(df),2)
# Combined share of firm codes "11" and "99" (employee transactions).
percent["11"]+percent["99"]
FirmCount is observable only if the transaction is made by CardID. Knowing that 99.29% of the transactions are made by CardID, we can extract useful information from FirmCount, such as: over 84.76% of the canteen's customers are employees and 15.34% are guests.
# Position types per year and location, plus their overall percentage shares.
g = sns.catplot(x="PositionType",col="year",data=df,hue="location",sharey=False ,kind="count",legend=False)
pd.Series(g.fig.get_axes()).apply(lambda x : x.legend());
round(df["PositionType"].value_counts()*100/len(df),2)
over 99.74% of the transactions are items sold at the cash register
# Number of distinct items ever sold, then the overall top-20 items and the
# top-20 per location.
len(set(df["ItemDescription"]))
plt.figure(figsize=(12,6))
sns.barplot(y=df["ItemDescription"].value_counts().sort_values(ascending=False).head(20).index.tolist()
,x=df["ItemDescription"].value_counts().sort_values(ascending=False).head(20).values)
f , axes = plt.subplots(nrows=1, ncols=2,figsize=(30,7))
sns.barplot(y=df[df["location"]=="Lehel"]["ItemDescription"].value_counts().sort_values(ascending=False).head(20).index.tolist()
,x=df[df["location"]=="Lehel"]["ItemDescription"].value_counts().sort_values(ascending=False).head(20).values
,ax=axes[0])
axes[0].set_title("Lehel",fontsize=30)
sns.barplot(y=df[df["location"]=="Giesing"]["ItemDescription"].value_counts().sort_values(ascending=False).head(20).index.tolist()
,x=df[df["location"]=="Giesing"]["ItemDescription"].value_counts().sort_values(ascending=False).head(20).values
,ax=axes[1])
axes[1].set_title("Giesing",fontsize=30);
# Top-20 sold items per year (transaction count per ItemDescription).
f , axes = plt.subplots(nrows=1, ncols=8,figsize=(110,6))
for l, yy in enumerate([2011,2012,2013,2014,2015,2016,2017,2018]):
    # Compute the per-item counts once (the original evaluated the same
    # groupby chain twice, for data= and order=).
    counts = (df[df["year"] == yy][["ItemDescription","ItemNumber"]]
              .groupby("ItemDescription").count().reset_index()
              .sort_values(by="ItemNumber", ascending=False))
    sns.barplot(x="ItemNumber", y="ItemDescription", data=counts,
                order=counts["ItemDescription"].head(20), ax=axes[l])
    axes[l].set_title(yy)
# Faceted view: sales counts of the overall top-30 items, one facet per year.
g = sns.catplot(y="ItemNumber"
,x="ItemDescription"
,col="year"
,data=df[["ItemDescription","year","ItemNumber"]].groupby(["ItemDescription","year"]).count().reset_index()
,order=df["ItemDescription"].value_counts().sort_values(ascending=False).head(30).index, kind="bar")
g.set_axis_labels("ItemDescription","Number of item sold")
g.set_xticklabels(rotation=90)
# Top-15 sold items per year and location (row 0 = Lehel, row 1 = Giesing).
f , axes = plt.subplots(nrows=2, ncols=8,figsize=(60,7.5))
for l, yy in enumerate([2011,2012,2013,2014,2015,2016,2017,2018]):
    for row, loc in enumerate(["Lehel", "Giesing"]):
        # One groupby per panel instead of two identical ones.
        counts = (df[(df["year"] == yy) & (df["location"] == loc)][["ItemDescription","ItemNumber"]]
                  .groupby("ItemDescription").count().reset_index()
                  .sort_values(by="ItemNumber", ascending=False))
        sns.barplot(x="ItemNumber", y="ItemDescription", data=counts,
                    order=counts["ItemDescription"].head(15), ax=axes[row][l])
        axes[row][l].set_title(loc + " " + str(yy))
f.tight_layout()
The top 20 sold items over all years are as shown in these plots, from Beilage a 0.45 and O-Saft frisch down to Beilage a 0.75. Over all years, items like Beilage a 0.45, O-Saft frisch, Breze 0.65, Salat Port 0.60 and SalatTeller were always the top 5 best sellers, except for 2011, when Gelb 3.30 was sold more than SalatTeller, and for 2017 and 2018, when Grun 1.99 was the new top-selling item. The top 5 items kept essentially the same rank over the years; when one lost its rank it dropped by at most one place. Other items behaved differently: BeiwerkTeller, for example, was in 7th place in 2011 and 6th in 2012, but then kept losing rank until it fell out of the top-20 list. We might explain the high rank of Breze (pretzel) and Semmel (bread rolls) by the German food culture, which has a strong baking tradition with many varieties of bread to choose from.
# Top-20 sold items per calendar month (seasonality check).
f , axes = plt.subplots(nrows=1, ncols=12,figsize=(180,6))
for l, mm in enumerate(range(1, 13)):
    counts = (df[df["month"] == mm][["ItemDescription","ItemNumber"]]
              .groupby("ItemDescription").count().reset_index()
              .sort_values(by="ItemNumber", ascending=False))
    sns.barplot(x="ItemNumber", y="ItemDescription", data=counts,
                order=counts["ItemDescription"].head(20), ax=axes[l])
    axes[l].set_title(mm)
# Top-15 sold items per month and location (row 0 = Lehel, row 1 = Giesing).
f , axes = plt.subplots(nrows=2, ncols=12,figsize=(80,7.5))
for l, mm in enumerate(range(1, 13)):
    for row, loc in enumerate(["Lehel", "Giesing"]):
        counts = (df[(df["month"] == mm) & (df["location"] == loc)][["ItemDescription","ItemNumber"]]
                  .groupby("ItemDescription").count().reset_index()
                  .sort_values(by="ItemNumber", ascending=False))
        sns.barplot(x="ItemNumber", y="ItemDescription", data=counts,
                    order=counts["ItemDescription"].head(15), ax=axes[row][l])
        axes[row][l].set_title(loc + " " + str(mm))
f.tight_layout()
There is not a big difference in the rank of the top sold items across months, which suggests that, visually, the seasons and months have no remarkable influence on the behavior of this canteen's customers.
# Top-20 sold items per day of the week (Monday..Sunday order comes from the
# ordered categorical).
f , axes = plt.subplots(nrows=1, ncols=7,figsize=(100,6))
for l, day in enumerate(df.dayoftheweek.unique().sort_values().tolist()):
    counts = (df[df["dayoftheweek"] == day][["ItemDescription","ItemNumber"]]
              .groupby("ItemDescription").count().reset_index()
              .sort_values(by="ItemNumber", ascending=False))
    sns.barplot(x="ItemNumber", y="ItemDescription", data=counts,
                order=counts["ItemDescription"].head(20), ax=axes[l])
    axes[l].set_title(day)
# Top-15 sold items per weekday and location.
f , axes = plt.subplots(nrows=2, ncols=7,figsize=(60,7.5))
for l, day in enumerate(df.dayoftheweek.unique().sort_values().tolist()):
    for row, loc in enumerate(["Lehel", "Giesing"]):
        counts = (df[(df["dayoftheweek"] == day) & (df["location"] == loc)][["ItemDescription","ItemNumber"]]
                  .groupby("ItemDescription").count().reset_index()
                  .sort_values(by="ItemNumber", ascending=False))
        sns.barplot(x="ItemNumber", y="ItemDescription", data=counts,
                    order=counts["ItemDescription"].head(15), ax=axes[row][l])
        axes[row][l].set_title(loc + " " + str(day))
f.tight_layout()
There are items that are more or least sold than others from a day to another but the top 5 best sold items are always on top of all items over all days.
# The 20 LEAST frequently sold items overall (value_counts computed once
# instead of twice).
plt.figure(figsize=(12,6))
least = df["ItemDescription"].value_counts().sort_values(ascending=True).head(20)
sns.barplot(y=least.index.tolist(), x=least.values)
# Bottom-20 sold items per year (observed=True skips empty category levels).
f , axes = plt.subplots(nrows=1, ncols=8,figsize=(110,6))
for l, yy in enumerate([2011,2012,2013,2014,2015,2016,2017,2018]):
    counts = (df[df["year"] == yy][["ItemDescription","ItemNumber"]]
              .groupby("ItemDescription", observed=True).count().reset_index()
              .sort_values(by="ItemNumber", ascending=True))
    sns.barplot(x="ItemNumber", y="ItemDescription", data=counts,
                order=counts["ItemDescription"].head(20), ax=axes[l])
    axes[l].set_title(yy)
# Faceted view: sales counts of the overall bottom-15 items, per year.
g = sns.catplot(x="ItemNumber"
,y="ItemDescription"
,col="year"
,data=df[["ItemDescription","year","ItemNumber"]].groupby(["ItemDescription","year"]).count().reset_index()
,order=df["ItemDescription"].value_counts().sort_values(ascending=True).head(15).index, kind="bar")
g.set_axis_labels("Number of item sold", "ItemDescription")
# Bottom-15 sold items per year and location.
f , axes = plt.subplots(nrows=2, ncols=8,figsize=(60,7.5))
for l, yy in enumerate([2011,2012,2013,2014,2015,2016,2017,2018]):
    for row, loc in enumerate(["Lehel", "Giesing"]):
        counts = (df[(df["year"] == yy) & (df["location"] == loc)][["ItemDescription","ItemNumber"]]
                  .groupby("ItemDescription", observed=True).count().reset_index()
                  .sort_values(by="ItemNumber", ascending=True))
        sns.barplot(x="ItemNumber", y="ItemDescription", data=counts,
                    order=counts["ItemDescription"].head(15), ax=axes[row][l])
        axes[row][l].set_title(loc + " " + str(yy))
f.tight_layout()
set(df["SubventionLevel"])
# Top-10 items by total revenue over all years (sum computed once; the
# original built the same groupby-sum twice for y= and x=).
plt.figure(figsize=(10,4))
rev = (df[["ItemDescription","Revenue"]]
       .groupby("ItemDescription", observed=True)["Revenue"].sum()
       .sort_values(ascending=False).head(10))
sns.barplot(y=rev.index.tolist(), x=rev.values)
# Top-15 revenue items per location.
f , axes = plt.subplots(nrows=1, ncols=2,figsize=(20,7))
for ax, loc in zip(axes, ["Lehel", "Giesing"]):
    rev = (df[df["location"] == loc][["ItemDescription","Revenue"]]
           .groupby("ItemDescription", observed=True)["Revenue"].sum()
           .sort_values(ascending=False).head(15))
    sns.barplot(y=rev.index.tolist(), x=rev.values, ax=ax)
    ax.set_title(loc, fontsize=20)
As we can see from this per-item revenue barplot, SalatTeller and BeiwerkTeller take the top two positions, and the menu Grun (green) 1.99 takes third place, followed by the menu Gelb (yellow) with a similar value of about 500000. We might hypothesize that the guests are mainly interested in salads, sides, and the three types of menus: Rot, Grun and Gelb.
# Top-20 items by revenue, per year.
f , axes = plt.subplots(nrows=1, ncols=8,figsize=(100,6))
for l, yy in enumerate([2011,2012,2013,2014,2015,2016,2017,2018]):
    rev = (df[df["year"] == yy][["ItemDescription","Revenue"]]
           .groupby("ItemDescription", observed=True).sum().reset_index())
    sns.barplot(x="Revenue", y="ItemDescription", data=rev,
                order=rev.sort_values(by="Revenue", ascending=False)["ItemDescription"].head(20),
                ax=axes[l])
    axes[l].set_title(yy)
# Faceted view: yearly revenue of the overall top-20 revenue items.
g = sns.catplot(x="Revenue"
,y="ItemDescription"
,col="year"
,data=df[["ItemDescription","Revenue","year"]].groupby(["ItemDescription","year"],observed=True).sum().reset_index()
,order=df[["ItemDescription","Revenue"]].groupby("ItemDescription",observed=True).sum().sort_values(by="Revenue",ascending=False).head(20).index.tolist(), kind="bar")
g.set_axis_labels("Revenue", "ItemDescription")
# Top-15 revenue items per year and location.
f , axes = plt.subplots(nrows=2, ncols=8,figsize=(60,7.5))
for l, yy in enumerate([2011,2012,2013,2014,2015,2016,2017,2018]):
    for row, loc in enumerate(["Lehel", "Giesing"]):
        rev = (df[(df["year"] == yy) & (df["location"] == loc)][["ItemDescription","Revenue"]]
               .groupby("ItemDescription", observed=True).sum().reset_index())
        sns.barplot(x="Revenue", y="ItemDescription", data=rev,
                    order=rev.sort_values(by="Revenue", ascending=False)["ItemDescription"].head(15),
                    ax=axes[row][l])
        axes[row][l].set_title(loc + " " + str(yy))
f.tight_layout()
# Top-15 revenue items per month and location.
f , axes = plt.subplots(nrows=2, ncols=12,figsize=(80,7.5))
for l, mm in enumerate(range(1, 13)):
    for row, loc in enumerate(["Lehel", "Giesing"]):
        rev = (df[(df["month"] == mm) & (df["location"] == loc)][["ItemDescription","Revenue"]]
               .groupby("ItemDescription", observed=True).sum().reset_index())
        sns.barplot(x="Revenue", y="ItemDescription", data=rev,
                    order=rev.sort_values(by="Revenue", ascending=False)["ItemDescription"].head(15),
                    ax=axes[row][l])
        axes[row][l].set_title(loc + " " + str(mm))
f.tight_layout()
# Top-15 revenue items per weekday and location.
f , axes = plt.subplots(nrows=2, ncols=7,figsize=(60,7.5))
for l, day in enumerate(df.dayoftheweek.unique().sort_values().tolist()):
    for row, loc in enumerate(["Lehel", "Giesing"]):
        rev = (df[(df["dayoftheweek"] == day) & (df["location"] == loc)][["ItemDescription","Revenue"]]
               .groupby("ItemDescription", observed=True).sum().reset_index())
        sns.barplot(x="Revenue", y="ItemDescription", data=rev,
                    order=rev.sort_values(by="Revenue", ascending=False)["ItemDescription"].head(15),
                    ax=axes[row][l])
        axes[row][l].set_title(loc + " " + str(day))
f.tight_layout()
# Refunds (negative revenue rows) per year: the 15 most-refunded items,
# counted per item.
f , axes = plt.subplots(nrows=1, ncols=8,figsize=(110,6))
for l, yy in enumerate([2011,2012,2013,2014,2015,2016,2017,2018]):
    refunds = (df[(df["year"] == yy) & (df["Revenue"] < 0)][["ItemDescription","Revenue","year"]]
               .groupby(["ItemDescription","year"], observed=True).count()
               .sort_values(by="Revenue", ascending=False).reset_index())
    sns.barplot(x="Revenue", y="ItemDescription", data=refunds,
                order=refunds.ItemDescription.head(15), ax=axes[l])
    axes[l].set_title(yy)
# Faceted view: yearly refund counts of the overall top-10 refunded items.
g = sns.catplot(y="ItemDescription"
,x="Revenue"
,col="year"
,order = df[df["Revenue"] < 0][["ItemDescription","Revenue"]].groupby("ItemDescription",observed=True).count().sort_values(by="Revenue",ascending=False).reset_index()["ItemDescription"].head(10)
,data=df[df["Revenue"] < 0][["ItemDescription","Revenue","year"]].groupby(["ItemDescription","year"],observed=True).count().sort_values(by="Revenue",ascending=False).reset_index()
,kind="bar")
g.set_axis_labels("Number of item refunded", "ItemDescription")
# Refunds per year and location: top-10 most refunded items.
f , axes = plt.subplots(nrows=2, ncols=8,figsize=(60,7.5))
for l, yy in enumerate([2011,2012,2013,2014,2015,2016,2017,2018]):
    for row, loc in enumerate(["Lehel", "Giesing"]):
        refunds = (df[(df["year"] == yy) & (df["location"] == loc) & (df["Revenue"] < 0)][["ItemDescription","Revenue","year"]]
                   .groupby(["ItemDescription","year"], observed=True).count()
                   .sort_values(by="Revenue", ascending=False).reset_index())
        sns.barplot(x="Revenue", y="ItemDescription", data=refunds,
                    order=refunds.ItemDescription.head(10), ax=axes[row][l])
        axes[row][l].set_title(loc + " " + str(yy))
f.tight_layout()
# Refunds per month and location: top-10 most refunded items.
f , axes = plt.subplots(nrows=2, ncols=12,figsize=(80,7.5))
for l, mm in enumerate(range(1, 13)):
    for row, loc in enumerate(["Lehel", "Giesing"]):
        refunds = (df[(df["month"] == mm) & (df["location"] == loc) & (df["Revenue"] < 0)][["ItemDescription","Revenue","month"]]
                   .groupby(["ItemDescription","month"], observed=True).count()
                   .sort_values(by="Revenue", ascending=False).reset_index())
        sns.barplot(x="Revenue", y="ItemDescription", data=refunds,
                    order=refunds.ItemDescription.head(10), ax=axes[row][l])
        axes[row][l].set_title(loc + " " + str(mm))
f.tight_layout()
# Distribution of tax rates over all sold items.
sns.countplot(x="Taxes",data=df)
About 7000000 sold items have 19% taxes and about 2500000 have 7% taxes.
# Percentage share of each tax rate.
round(df["Taxes"].value_counts()*100/len(df),2)
73.39% of the products are taxed at 19%, 24.75% are taxed at 7%, and only 1.85% of the products are free from taxes.
# Spot check: which hours appear on a sample day (2016-04-02, a Saturday)?
df[df.DateTime.dt.date == datetime.date(2016,4,2)].DateTime.dt.strftime("%H").unique()
# Transaction counts aggregated per year / month / day.
f,axes = plt.subplots(nrows=3,ncols=1,figsize=(7,8))
df["CardID"].groupby(df.DateTime.dt.to_period("Y")).count().plot(ax=axes[0],title="per year")
df["CardID"].groupby(df.DateTime.dt.to_period("M")).count().plot(ax=axes[1],title="per month")
df["CardID"].groupby(df.DateTime.dt.to_period("D")).count().plot(ax=axes[2],title="per day")
f.tight_layout()
In the first graph of the yearly time series, we can see a remarkable decrease in the number of guests in 2013 and then a massive decrease in 2017, after which the number dropped below 120000 guests and stayed stable. The decrease might also be related to the European economic crisis.
In the second graph, we can observe high variability between 2014 and 2016.
As for the third graph, a time series of the number of guests per day, we observe sudden, regular dips that are mostly explained by weekends.
# Same time series split by location.  NOTE(review): the grouping key is the
# unfiltered df.DateTime series; pandas aligns it to the filtered selection
# by index, which works because the filtered Series keeps its original index.
f,axes = plt.subplots(nrows=3,ncols=2,figsize=(14,8))
df[df["location"] == "Lehel"]["CardID"].groupby(df.DateTime.dt.to_period("Y")).count().plot(ax=axes[0][0],title="Lehel : per year")
df[df["location"] == "Lehel"]["CardID"].groupby(df.DateTime.dt.to_period("M")).count().plot(ax=axes[1][0],title="Lehel : per month")
df[df["location"] == "Lehel"]["CardID"].groupby(df.DateTime.dt.to_period("D")).count().plot(ax=axes[2][0],title="Lehel : per day")
df[df["location"] == "Giesing"]["CardID"].groupby(df.DateTime.dt.to_period("Y")).count().plot(ax=axes[0][1],title="Giesing : per year")
df[df["location"] == "Giesing"]["CardID"].groupby(df.DateTime.dt.to_period("M")).count().plot(ax=axes[1][1],title="Giesing : per month")
df[df["location"] == "Giesing"]["CardID"].groupby(df.DateTime.dt.to_period("D")).count().plot(ax=axes[2][1],title="Giesing : per day")
f.tight_layout()
According to the time series graphs of the Lehel guest numbers, the peak was in 2011 at about 270000 guests; the count dropped to about 260000 by 2014 and declined further to 240000 in 2016, then rebounded in 2017 before falling again in 2018.
According to the time series graphs of the Giesing guest numbers, the count reached about 105000, then slowly started dropping, rose again between 2014 and 2015, and increased further in 2017.
Counting each CardID only once per day:
# Unique visitors: count each CardID at most once per calendar day, then
# aggregate those daily unique counts per year / month / day.
f,axes = plt.subplots(nrows=3,ncols=1,figsize=(7,8))
df[["year","month","day","CardID"]].groupby(["year","month","day"]).nunique().drop(["year","month","day"],axis=1).reset_index().groupby("year").sum()["CardID"].plot(ax=axes[0],title="per year")
df[["year","month","day","CardID"]].groupby([df.DateTime.dt.date,"year","month","day"]).nunique().drop(["year","month","day"],axis=1).reset_index().groupby(["year","month"]).sum()["CardID"].plot(ax=axes[1],title="per month")
df[["year","month","day","CardID"]].groupby(["year","month","day"]).nunique().drop(["year","month","day"],axis=1)["CardID"].plot(ax=axes[2],title="per day")
f.tight_layout()
# Daily-unique CardID counts, split by location (same aggregation as above).
f,axes = plt.subplots(nrows=3,ncols=2,figsize=(14,8))
df[df["location"] == "Lehel"][["year","month","day","CardID"]].groupby(["year","month","day"]).nunique().drop(["year","month","day"],axis=1).reset_index().groupby("year").sum()["CardID"].plot(ax=axes[0][0],title="Lehel : per year")
df[df["location"] == "Lehel"][["year","month","day","CardID"]].groupby([df.DateTime.dt.date,"year","month","day"]).nunique().drop(["year","month","day"],axis=1).reset_index().groupby(["year","month"]).sum()["CardID"].plot(ax=axes[1][0],title="Lehel : per month")
df[df["location"] == "Lehel"][["year","month","day","CardID"]].groupby(["year","month","day"]).nunique().drop(["year","month","day"],axis=1)["CardID"].plot(ax=axes[2][0],title="Lehel : per day")
df[df["location"] == "Giesing"][["year","month","day","CardID"]].groupby(["year","month","day"]).nunique().drop(["year","month","day"],axis=1).reset_index().groupby("year").sum()["CardID"].plot(ax=axes[0][1],title="Giesing : per year")
df[df["location"] == "Giesing"][["year","month","day","CardID"]].groupby([df.DateTime.dt.date,"year","month","day"]).nunique().drop(["year","month","day"],axis=1).reset_index().groupby(["year","month"]).sum()["CardID"].plot(ax=axes[1][1],title="Giesing : per month")
df[df["location"] == "Giesing"][["year","month","day","CardID"]].groupby(["year","month","day"]).nunique().drop(["year","month","day"],axis=1)["CardID"].plot(ax=axes[2][1],title="Giesing : per day")
f.tight_layout()
The number of guests decreased from 2011 to 2018, most notably by over 80,000 in 2013 and again in 2017.
# Distribution of monthly guest counts (transactions per month), one box per year
plt.figure(figsize=[14,6])
ax = sns.boxplot(x="year"
,y="CardID"
,data=df[["year","month","CardID"]].groupby(["year","month"]).count().reset_index())
The medians of the yearly boxplots are similar, which indicates a certain periodicity; we can conclude that the year-to-year variability in the number of guests is low.
# Same yearly distribution of monthly guest counts, split by canteen
plt.figure(figsize=[14,6])
ax = sns.boxplot(x="year"
,y="CardID"
,data=df[["year","month","location","CardID"]].groupby(["location","year","month"]).count().reset_index()
,hue="location")
As we see here, the number of guests is always higher in Giesing than in Lehel.
# Distribution of guest counts per calendar month (across years), one box per month
plt.figure(figsize=[14,6])
ax = sns.boxplot(x="month"
,y="CardID"
,data=df[["year","month","CardID"]].groupby(["year","month"]).count().reset_index())
We can notice that November is the most crowded month, with a median between 110,000 and 120,000 guests, followed by October, which also reaches a maximum of over 120,000 guests. December has the lowest number of guests, which might be explained by the New Year's Eve vacation. August is also low, but that is because most canteens are closed while most employees are on vacation. The average number of guests in the remaining months of the year is between 95,000 and 115,000. We can conclude that there is a certain variability in the number of guests across months.
# Monthly guest-count distribution split by canteen
plt.figure(figsize=[14,6])
ax = sns.boxplot(x="month"
,y="CardID"
,data=df[["year","month","CardID","location"]].groupby(["location","year","month"]).count().reset_index()
,hue="location")
The number of guests in December is low compared to the other months in both Giesing and Lehel.
# Monthly guest counts as point plots, one facet per year (overall, then per canteen)
sns.catplot(data=df[["year","month","CardID"]].groupby(["year","month"]).count().reset_index()
,x="month"
,y="CardID"
,col="year",kind="point")
g = sns.catplot(data=df[["year","month","CardID","location"]].groupby(["location","year","month"]).count().reset_index()
,x="month"
,y="CardID"
,col="year"
,hue="location"
,kind="point")
# Daily guest counts, boxed per year with one box colour per month
plt.figure(figsize=[50,8])
ax = sns.boxplot(x="year"
,y="CardID"
,hue="month"
,data=df[["year","month","day","CardID"]].groupby(["year","month","day"]).count().reset_index())
# Year (rows) x month (columns) heatmap of transaction counts
plt.figure(figsize=(10,6))
sns.heatmap(df.pivot_table(values="CardID",index="year",columns="month",aggfunc="count"), cmap="YlGnBu")
In this matrix, with years as rows and months as columns showing the number of guests, it is obvious that the density is much lower in 2017 and 2018 compared to the previous years, and that December is characterized by a low density of guests in every year — which matches our previous analysis.
# Guest counts per weekday, one facet per year (overall, then per canteen)
sns.catplot(data=df[["year","CardID","dayoftheweek"]].groupby(["year","dayoftheweek"]).count().reset_index()
,x="dayoftheweek"
,y="CardID"
,col="year",kind="point")
sns.catplot(data=df[["year","CardID","dayoftheweek","location"]].groupby(["location","year","dayoftheweek"]).count().reset_index()
,x="dayoftheweek"
,y="CardID"
,col="year"
,hue="location",kind="point")
# Monthly guest counts per weekday, boxed per year
plt.figure(figsize=[50,8])
ax = sns.boxplot(x="year"
,y="CardID"
,hue="dayoftheweek"
,data=df[["year","month","dayoftheweek","CardID"]].groupby(["year","month","dayoftheweek"]).count().reset_index())
# Total revenue per year / month / day
f,axes = plt.subplots(nrows=3,ncols=1,figsize=(7,8))
df["Revenue"].groupby(df.DateTime.dt.to_period("Y")).sum().plot(ax=axes[0],title="per year")
df["Revenue"].groupby(df.DateTime.dt.to_period("M")).sum().plot(ax=axes[1],title="per month")
df["Revenue"].groupby(df.DateTime.dt.to_period("D")).sum().plot(ax=axes[2],title="per day")
f.tight_layout()
# Revenue series per canteen (left: Lehel, right: Giesing)
f,axes = plt.subplots(nrows=3,ncols=2,figsize=(14,8))
df[df["location"] == "Lehel"]["Revenue"].groupby(df.DateTime.dt.to_period("Y")).sum().plot(ax=axes[0][0],title="Lehel : per year")
df[df["location"] == "Lehel"]["Revenue"].groupby(df.DateTime.dt.to_period("M")).sum().plot(ax=axes[1][0],title="Lehel : per month")
df[df["location"] == "Lehel"]["Revenue"].groupby(df.DateTime.dt.to_period("D")).sum().plot(ax=axes[2][0],title="Lehel : per day")
df[df["location"] == "Giesing"]["Revenue"].groupby(df.DateTime.dt.to_period("Y")).sum().plot(ax=axes[0][1],title="Giesing : per year")
df[df["location"] == "Giesing"]["Revenue"].groupby(df.DateTime.dt.to_period("M")).sum().plot(ax=axes[1][1],title="Giesing : per month")
df[df["location"] == "Giesing"]["Revenue"].groupby(df.DateTime.dt.to_period("D")).sum().plot(ax=axes[2][1],title="Giesing : per day")
f.tight_layout()
# Revenue per weekday and year (summed), overall then per canteen
sns.catplot(data=df[["Revenue","year","dayoftheweek"]].groupby(["year","dayoftheweek"]).sum().sort_index().reset_index()
,x="dayoftheweek"
,y="Revenue"
,col="year",kind="point")
sns.catplot(data=df[["location","Revenue","year","dayoftheweek"]].groupby(["location","year","dayoftheweek"]).sum().sort_index().reset_index()
,x="dayoftheweek"
,y="Revenue"
,col="year"
,hue="location",kind="point")
# NOTE(review): this boxplot aggregates Revenue with .count() (transaction counts),
# although the y-axis is labelled Revenue; .sum() would match the catplots above - confirm intent.
plt.figure(figsize=[60,8])
ax = sns.boxplot(x="year"
,y="Revenue"
,hue="dayoftheweek"
,data=df[["year","month","dayoftheweek","Revenue"]].groupby(["year","month","dayoftheweek"]).count().reset_index())
# Revenue per month and year (summed), overall then per canteen
sns.catplot(data=df[["Revenue","month","year"]].groupby(["year","month"]).sum().sort_index().reset_index()
,x="month"
,y="Revenue"
,col="year",kind="point")
sns.catplot(data=df[["location","Revenue","month","year"]].groupby(["location","year","month"]).sum().sort_index().reset_index()
,x="month"
,y="Revenue"
,col="year"
,hue="location",kind="point")
# NOTE(review): same .count()-vs-.sum() question as the boxplot above.
plt.figure(figsize=[60,8])
ax = sns.boxplot(x="year"
,y="Revenue"
,hue="month"
,data=df[["year","month","dayoftheweek","Revenue"]].groupby(["year","month","dayoftheweek"]).count().reset_index())
Even though the number of customers decreased over the years, the canteen's revenue kept growing, recording only a small decrease in 2017 before rising again in 2018 to almost 2,000,000. This might be explained by a rise in the prices of certain items or by increased taxes; it could also mean that individual guests buy more items.
# Year (rows) x month (columns) heatmap of total revenue
plt.figure(figsize=(10,6))
sns.heatmap(df.pivot_table(values="Revenue",index="year",columns="month",aggfunc=np.sum), cmap="YlGnBu")
# Total revenue per cash point, one facet per year
g = sns.catplot(y="Revenue"
,x="CashPoint"
,col="year"
,data=df[["Revenue","year","CashPoint"]].groupby(["year","CashPoint"],observed=True).sum().reset_index()
,kind="bar")
g.set_xticklabels(rotation=90)
Cash points 11, 12 and 13 are the top three cash points by income from 2011 to 2018, and the revenue of the Giesing canteen is over five times that of the Lehel canteen. Cash point 12 was the top-earning cash point from 2011 to 2013, until cash point 11 took its place in 2014. In conclusion, Giesing with its four cash points has a better income over the years than Lehel with its two.
# Dependencies for the Excel menu-parsing pipeline below
from xlrd import open_workbook , XLRDError
import os
from IPython.display import clear_output
import locale
# German locale so strptime's %A understands German weekday names (Montag, Dienstag, ...)
locale.setlocale(locale.LC_TIME,"de")
def executeItemCleaningTask(df,start,stop):
    """Run the menu-matching pipeline for every year from start to stop (inclusive).

    Mutates df in place (ItemDescription is cast to str and matched rows are
    renamed by the helpers) and returns a DataFrame of per-item match records.
    """
    stat_columns = ['canteen','week','dayOfWeek','year','month','day','oldItem','newItem','match']
    matchStat = pd.DataFrame(columns=stat_columns)
    # the matching helpers rely on string operations on the descriptions
    df["ItemDescription"] = df['ItemDescription'].astype('str')
    year = start
    while year <= stop:
        matchStat = matchProcess(df,'Mitakus Data/menu/'+str(year),matchStat)
        print(year,' : done')
        year += 1
    return matchStat
def matchProcess(df,folder_path,matchStat):
    """Match every menu workbook found under folder_path against df.

    The year is recovered from the digits embedded in the folder path; each
    file is handed to match(), which grows and returns matchStat.
    """
    # get the year from the path (first run of digits)
    year = re.findall(r'\d+',folder_path)[0]
    # match week by week: one workbook per file
    for _dirpath, _dirnames, filenames in os.walk(folder_path):
        for fname in filenames:
            workbook_path = folder_path+'/'+fname
            matchStat = match(year,df,workbook_path,fname,matchStat)
    return matchStat
def match(year,df,name,filename,matchStat):
    """Match one weekly menu workbook against df and record the results.

    The second dot-separated segment of filename encodes the week number
    (first two characters) followed by the canteen name. Workbooks that xlrd
    cannot open are reported and skipped.
    """
    tag = filename.split('.')[1]
    week, canteen = tag[:2], tag[2:]
    try :
        workbook = open_workbook(name, formatting_info=True)
        # only process regular menu sheets, not private-event workbooks
        if NoPrivateEvent(workbook):
            menu_sheet = workbook.sheet_by_index(2)
            weekly_menu = getMenu(year,week,menu_sheet,workbook)
            matchStat = menuMatch(weekly_menu,df,canteen,year,week,matchStat)
    except XLRDError as err:
        print(filename,' : ',err)
    return matchStat
def NoPrivateEvent(wb):
    """Return True when the workbook's first sheet is a regular canteen menu.

    The header cell (row 1, column 1) of ordinary menus carries one of two
    known titles; anything else is treated as a private event.
    """
    header = wb.sheet_by_index(0).cell(1,1).value
    return header in ('Casino Giesing Speisekarte','Casino Lehel Speisekarte')
def menuMatch(menu,df,canteen,year,week,matchStat):
    """Rename transaction descriptions in df that match a parsed weekly menu.

    menu maps (dayofweek, day, month, '<colour> <price>' key) -> cleaned item
    text. For each key, transactions of the given canteen and date whose
    ItemDescription contains the key are rewritten in place to the cleaned
    text; a record of every match / non-match is appended to matchStat,
    which is returned.

    Fixes vs. the original:
      * DataFrame.append (removed in pandas 2.0) replaced by pd.concat;
      * the row mask is computed once per lookup instead of twice;
      * the record-building boilerplate is factored into one helper.
    """

    def _day_mask(month, day, text):
        # All conditions a transaction must satisfy to count as a match.
        # NOTE(review): str.contains treats `text` as a regex, exactly as the
        # original did; keys containing regex metacharacters would misbehave.
        return ((df["location"] == canteen)
                & (df["year"] == int(year))
                & (df["month"] == month)
                & (df["day"] == day)
                & (df["ItemDescription"].str.contains(text)))

    def _record(stats, dayofweek, month, day, old_item, new_item, matched):
        # DataFrame.append was removed in pandas 2.0 -> append one row via concat.
        row = {'canteen': canteen, 'week': week, 'dayOfWeek': dayofweek,
               'year': year, 'month': month, 'day': day,
               'oldItem': old_item, 'newItem': new_item, 'match': matched}
        return pd.concat([stats, pd.DataFrame([row])], ignore_index=True)

    liveMatch = 0
    TotalItems = 0
    for dayofweek, day, month, item in menu:
        TotalItems += 1
        new_text = menu[(dayofweek, day, month, item)]
        mask = _day_mask(month, day, item)
        if mask.any():
            df.loc[mask, 'ItemDescription'] = new_text
            matchStat = _record(matchStat, dayofweek, month, day, item, new_text, 1)
            liveMatch += 1
        elif 'BIO' in item:
            # retry without the 'BIO ' prefix
            item2 = re.sub('BIO ', '', item)
            mask2 = _day_mask(month, day, item2)
            if mask2.any():
                df.loc[mask2, 'ItemDescription'] = new_text
                matchStat = _record(matchStat, dayofweek, month, day, item2, new_text, 1)
                liveMatch += 1
            else:
                matchStat = _record(matchStat, dayofweek, month, day, item, '', 0)
        else:
            matchStat = _record(matchStat, dayofweek, month, day, item, '', 0)
    if TotalItems != 0:
        liveProgress = 'Canteen : '+canteen+' Year : '+year+' Week : '+week+' Match Accuracy : '+str(round((liveMatch*100)/TotalItems,2))+'% \n'
        clear_output()
        print(liveProgress)
    return matchStat
def getMenu(year,week,sheet,wb):
    """Parse one weekly menu sheet into a dict.

    Walks the sheet column by column. Each day block starts with a German
    weekday name. Text cells are accumulated into one buffer per background
    colour (11=green, 13=yellow, 10=red) until a numeric price cell is
    reached, at which point the accumulated item is cleaned and stored under
    the key (weekday, day-of-month, month, '<colour label> <price>').

    NOTE(review): indentation reconstructed from a whitespace-mangled dump;
    the else below is paired with `if(skip==False)` since that is the only
    reading under which `skip` can ever be reset - confirm against the
    original notebook.
    """
    # German weekday names mark the start of a new day block
    Days=['Montag','Dienstag','Mittwoch','Donnerstag','Freitag','Samstag','Sonntag']
    menu={}
    skip=False  # set when checkNextCell has already consumed the row below
    date=''
    for column in range(1,sheet.ncols-1):
        for row in range(sheet.nrows):
            cell = sheet.cell(row,column)
            # the colour is read one column to the right of the text cell
            color = getCellColor(row,column+1,sheet,wb)
            # if we detect a new day
            if(cell.value in Days):
                # reset parameters and update the date
                day=cell.value
                date,cut,skip,greenItem,yellowItem,redItem=checkPoint(day,week,year)
            # if we detect a text cell
            if(skip==False):
                if cell.ctype==1:
                    if color in [11,10,13] : # colour-coded item text
                        greenItem,yellowItem,redItem=colorDetection(color,greenItem,yellowItem,redItem,cell.value)
                        cut=True
                    else :
                        cut=False # uncoloured text: don't cut at the next price
            else:
                skip=False  # this row was absorbed by checkNextCell; re-arm
            # now if we detect a price we assign to it the previous items or item
            if(cell.ctype == 2 and cut == True):
                price=str(cell.value)
                price=price.replace('.',',')  # German decimal comma
                if greenItem != '' :
                    greenItem,skip=checkNextCell(row,column,11,greenItem,sheet,wb) #check the next cell
                    greenItem=cleanItemTxt(greenItem) #clean the item
                    menu[(day,date.day,date.month,colorDecoder(11,greenItem)+' '+price)] = greenItem #load the Menu
                    greenItem='' #reset item
                elif yellowItem != '' :
                    yellowItem,skip=checkNextCell(row,column,13,yellowItem,sheet,wb) #check the next cell
                    yellowItem=cleanItemTxt(yellowItem) #clean the item
                    menu[(day,date.day,date.month,colorDecoder(13,yellowItem)+' '+price)] = yellowItem #load the Menu
                    yellowItem='' #reset item
                elif redItem != '' :
                    redItem,skip=checkNextCell(row,column,10,redItem,sheet,wb) #check the next cell
                    redItem=cleanItemTxt(redItem) #clean the item
                    menu[(day,date.day,date.month,colorDecoder(10,redItem)+' '+price)] = redItem #load the Menu
                    redItem='' #reset item
    return menu
def checkNextCell(row,column,color,Item,sheet,wb):
    """Peek at the cell one row below and merge it into Item when it continues it.

    A continuation is a text cell (ctype 1) with the same background colour;
    its value is appended with ' und '. Returns (possibly-extended item,
    skip flag) where skip=True tells the caller to ignore the absorbed row.
    """
    below = sheet.cell(row+1, column)
    below_color = getCellColor(row+1, column+1,sheet,wb)
    is_continuation = (below.ctype == 1 and below_color == color)
    if is_continuation:
        Item = Item+' und '+below.value
    return Item, is_continuation
def getCellColor(row,column,sheet,wb):
    """Return the background pattern colour index of the cell at (row, column).

    Looks the cell's extended-format record up in the workbook's xf_list
    (requires the workbook to be opened with formatting_info=True).
    """
    xf = wb.xf_list[sheet.cell(row, column).xf_index]
    return xf.background.pattern_colour_index
def checkPoint(day,week,year):
    """Start a fresh day block: resolve its calendar date and reset state.

    Returns (date, cut, skip, greenItem, yellowItem, redItem) with the flags
    cleared and the colour buffers emptied.
    """
    fresh_date = updateDate(year,week,day)
    return fresh_date, False, False, '', '', ''
def updateDate(year,week,day):
    """Resolve (year, %W week number, weekday name) into a datetime.

    %A parsing is locale-dependent: the notebook sets LC_TIME to German
    earlier, so `day` is expected to be a German weekday name there.
    """
    stamp = '-'.join((year, week, day))
    return datetime.datetime.strptime(stamp, "%Y-%W-%A")
def colorDetection(color,greenItem,yellowItem,redItem,newItem):
    """Append newItem (plus a trailing space) to the buffer matching color.

    Colour indices: 11=green, 13=yellow, 10=red. Any other colour leaves all
    three buffers untouched. Returns the (green, yellow, red) buffers.
    """
    addition = newItem+' '
    if color == 11:
        return greenItem+addition, yellowItem, redItem
    if color == 13:
        return greenItem, yellowItem+addition, redItem
    if color == 10:
        return greenItem, yellowItem, redItem+addition
    return greenItem, yellowItem, redItem
def cleanItemTxt(item):
    """Normalise whitespace in a menu item string.

    Newlines become spaces, runs of spaces collapse to one, and any spaces
    following a hyphen are dropped (re-joining hyphenated line breaks).
    """
    cleaned = item.replace('\n',' ')
    for pattern, repl in ((' +', ' '), ('- +', '-')):
        cleaned = re.sub(pattern, repl, cleaned)
    return cleaned
def colorDecoder(code,item):
    """Translate an xlrd colour index into the German label used in menu keys.

    11 -> 'Grün', 13 -> 'Gelb', 10 -> 'Rot'; ' BIO' is appended whenever the
    item text mentions 'bio' (case-insensitive).

    Raises:
        ValueError: for any other colour code. (The original fell through
        with `color` unassigned and crashed with UnboundLocalError.)
    """
    labels = {11: "Grün", 13: "Gelb", 10: "Rot"}
    try:
        color = labels[code]
    except KeyError:
        raise ValueError("unsupported colour code: " + str(code))
    if "bio" in item.lower():
        color += " BIO"
    return color
# Run the menu-matching/cleaning pipeline over 2016-2018 and checkpoint the results
matchStat=executeItemCleaningTask(df,2016,2018)
matchStat.to_csv('matchStat.csv' , index=False)
df[df['year']>2015].to_csv('cleanData16-18.csv',index=False)
# Reload the checkpoint, drop columns not needed downstream, and re-save
cleanData16_18=pd.read_csv('cleanData16-18.csv')
cleanData16_18=cleanData16_18.drop(['Date','Time','SubventionLevel'] , axis = 1)
cleanData16_18.to_csv('cleanData16-18.csv',index=False)
matchStat=pd.read_csv('matchStat.csv')
plt.figure(figsize=(6,5))
sns.countplot(data=matchStat ,x='match')
perM=round(len(matchStat[matchStat['match']==1])*100/len(matchStat),2)
print('Matched Items percentage from 2016 to 2018 : ')
print(' Matched Items Frequency : ',perM,'%')
print(' No Matched Items Frequency : ',round(100-perM,2),'%')
plt.figure(figsize=(10,5))
sns.countplot(data=matchStat ,x='year' , hue='match')
perM16=round(len(matchStat[(matchStat['year']==2016) & (matchStat['match']==1)])*100/len(matchStat[matchStat['year']==2016]),2)
perM17=round(len(matchStat[(matchStat['year']==2017) & (matchStat['match']==1)])*100/len(matchStat[matchStat['year']==2017]),2)
perM18=round(len(matchStat[(matchStat['year']==2018) & (matchStat['match']==1)])*100/len(matchStat[matchStat['year']==2018]),2)
print('Matched Items percentage for 2016 : ')
print(' Matched Items Frequency : ',perM16,'%')
print(' No Matched Items Frequency : ',round(100-perM16,2),'%')
print('Matched Items percentage for 2017 : ')
print(' Matched Items Frequency : ',perM17,'%')
print(' No Matched Items Frequency : ',round(100-perM17,2),'%')
print('Matched Items percentage for 2018 : ')
print(' Matched Items Frequency : ',perM18,'%')
print(' No Matched Items Frequency : ',round(100-perM18,2),'%')
plt.figure(figsize=(7,5))
sns.countplot(data=matchStat ,x='canteen' , hue='match')
perG=round(len(matchStat[(matchStat['canteen']=='Giesing') & (matchStat['match']==1)])*100/len(matchStat[matchStat['canteen']=='Giesing']),2)
perL=round(len(matchStat[(matchStat['canteen']=='Lehel') & (matchStat['match']==1)])*100/len(matchStat[matchStat['canteen']=='Lehel']),2)
print('Matched Items percentage for Giesing from 2016 to 2018 : ')
print(' Matched Items Frequency : ',perG,'%')
print(' No Matched Items Frequency : ',round(100-perG,2),'%')
print('Matched Items percentage for Lehel from 2016 to 2018 : ')
print(' Matched Items Frequency : ',perL,'%')
print(' No Matched Items Frequency : ',round(100-perL,2),'%')
plt.figure(figsize=[30,5])
sns.countplot(data=matchStat[matchStat['year']==2016] ,x='week' , hue='match')
plt.figure(figsize=[30,5])
sns.countplot(data=matchStat[matchStat['year']==2017] ,x='week' , hue='match')
plt.figure(figsize=[30,5])
sns.countplot(data=matchStat[matchStat['year']==2018] ,x='week' , hue='match')